1.0 Batch Correction Example

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

# Path of the file meant to hold the analysis results.
results_file = './write/pbmc3k.h5ad'

# Verbosity levels: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 3
sc.logging.print_versions()
scanpy==1.3.7 anndata==0.6.16 numpy==1.15.4 scipy==1.1.0 pandas==0.23.4 scikit-learn==0.20.2 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 
In [2]:
# Configure scanpy's figure settings with dpi=80.
sc.settings.set_figure_params(dpi=80)
In [3]:
# Batch 1: read the pbmc3k 10x matrix directory (the one holding the `.mtx`
# file). Gene symbols become the variable names, and cache=True lets later
# reads come from a cache file.
adata_1 = sc.read_10x_mtx(
    '../data/pbmc3k_filtered_gene_bc_matrices/hg19/',
    cache=True,
    var_names='gene_symbols',
)
... reading from cache file ./cache/data-pbmc3k_filtered_gene_bc_matrices-hg19-matrix.h5ad
In [4]:
# Batch 2: read from the very same directory as adata_1, so both batches
# start from identical data. Gene symbols become the variable names, and
# cache=True lets the read come from the cache file.
adata_2 = sc.read_10x_mtx(
    '../data/pbmc3k_filtered_gene_bc_matrices/hg19/',
    cache=True,
    var_names='gene_symbols',
)
... reading from cache file ./cache/data-pbmc3k_filtered_gene_bc_matrices-hg19-matrix.h5ad

Following Instructions from MNNPY Repo

In [5]:
# import scanpy.api as sc
import mnnpy
In [8]:
# Run MNN batch correction with adata_1 as the first batch and adata_2 as the second.
# NOTE(review): later cells index corrected[0][0] and corrected[0][1], so they rely on
# corrected[0] being a tuple of per-batch AnnData objects -- confirm the installed mnnpy
# returns that with default arguments (e.g. without passing do_concatenate=False).
corrected = mnnpy.mnn_correct(adata_1, adata_2)
Performing cosine normalization...
Starting MNN correct iteration. Reference batch: 0
Step 1 of 1: processing batch 1
  Looking for MNNs...
  Computing correction vectors...
  Adjusting variance...
  Applying correction...
MNN correction complete. Gathering output...
Packing AnnData object...
Done.
In [10]:
# Inspect the first element of the returned tuple (the corrected data).
corrected[0]
Out[10]:
(AnnData object with n_obs × n_vars = 2700 × 32738 
     var: 'gene_ids', AnnData object with n_obs × n_vars = 2700 × 32738 
     var: 'gene_ids')
In [13]:
# Check what kind of container mnn_correct returned.
type(corrected)
Out[13]:
tuple
In [14]:
# Number of elements in the returned container.
# Presumably (corrected data, MNN pair list, angle list) -- verify against the mnnpy docs.
len(corrected)
Out[14]:
3
In [30]:
def conv_to_df(adata):
    """Convert an AnnData-like object into a genes x barcodes DataFrame.

    Parameters
    ----------
    adata
        Object exposing ``X`` (a barcodes x genes matrix, dense or scipy
        sparse), ``obs`` (indexed by barcode) and ``var`` (indexed by gene).

    Returns
    -------
    pandas.DataFrame
        The transposed matrix: one row per gene, one column per barcode.
    """
    genes = adata.var.index.tolist()
    barcodes = adata.obs.index.tolist()
    mat = adata.X
    # Sparse matrices (duck-typed via ``toarray``) are not accepted as 2-D
    # numeric data by the DataFrame constructor, so densify them first.
    if hasattr(mat, 'toarray'):
        mat = mat.toarray()
    df = pd.DataFrame(data=mat.transpose(), columns=barcodes, index=genes)
    return df
In [32]:
# Genes x barcodes frame for the first corrected batch.
# Relies on corrected[0] being indexable per batch.
df_1 = conv_to_df(corrected[0][0])
df_1.shape
Out[32]:
(32738, 2700)
In [33]:
# Genes x barcodes frame for the second corrected batch.
# Relies on corrected[0] being indexable per batch.
df_2 = conv_to_df(corrected[0][1])
df_2.shape
Out[33]:
(32738, 2700)
In [34]:
# adata_1 and adata_2 were read from the same directory, so this checks whether
# the two batches still hold identical values after correction.
df_1.equals(df_2)
Out[34]:
False
In [38]:
# Column sums of the first batch: one total per barcode; show the first few.
df_1.sum().head()
Out[38]:
AAACATACAACCAC-1    12.268233
AAACATTGAGCTAC-1    13.007872
AAACATTGATCAGC-1    11.362640
AAACCGTGCTTCCG-1    12.173033
AAACCGTGTATGCG-1    10.409757
dtype: float32
In [37]:
# Column sums of the second batch: one total per barcode; show the first few.
df_2.sum().head()
Out[37]:
AAACATACAACCAC-1    12.263125
AAACATTGAGCTAC-1    13.002718
AAACATTGATCAGC-1    11.355673
AAACCGTGCTTCCG-1    12.165644
AAACCGTGTATGCG-1    10.402437
dtype: float32
In [39]:
from clustergrammer2 import net
clustergrammer2 backend version 0.2.9
In [40]:
# Visualize the first corrected batch with clustergrammer2.
net.load_df(df_1)
# Keep the top 100 rows (genes) ranked by variance.
net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')
# Z-score normalize along rows.
net.normalize(axis='row', norm_type='zscore')
# Clip values to the range [-5, 5].
net.clip(-5, 5)
# Last expression of the cell: the interactive widget.
net.widget()
In [41]:
# Visualize the second corrected batch with clustergrammer2 (same steps as for df_1).
net.load_df(df_2)
# Keep the top 100 rows (genes) ranked by variance.
net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')
# Z-score normalize along rows.
net.normalize(axis='row', norm_type='zscore')
# Clip values to the range [-5, 5].
net.clip(-5, 5)
# Last expression of the cell: the interactive widget.
net.widget()
In [ ]: